xxxxxxxxxx7
1
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using StatsBase
end

# Left Right Environment
xxxxxxxxxx34
1
begin
    # Two-state environment for off-policy Monte Carlo prediction.
    # From state 1, action 2 terminates immediately with reward 0; action 1
    # returns to state 1 with probability 0.9 (reward 0) or terminates in
    # state 2 with probability 0.1 (reward 1).
    Base.@kwdef mutable struct LeftRightEnv <: AbstractEnv
        reward::Float64 = 0.0   # reward produced by the most recent transition
        current_state::Int = 1  # 1 = non-terminal, 2 = terminal
    end

    RLBase.state_space(env::LeftRightEnv) = Base.OneTo(2)
    RLBase.action_space(env::LeftRightEnv) = Base.OneTo(2)

    # Apply action `a`, updating `env.reward` and `env.current_state` in place.
    function (env::LeftRightEnv)(a::Int)
        if a == 2
            env.reward = 0.0
            env.current_state = 2
        else
            # Draw index 1 w.p. 0.9 (loop back) or index 2 w.p. 0.1
            # (terminate with reward 1); weights already sum to 1.0.
            s = sample(Weights([0.9, 0.1], 1.0))
            if s == 1
                env.reward = 0.0
                env.current_state = 1
            else
                env.reward = 1.0
                env.current_state = 2
            end
        end
    end

    # Restart an episode: back to state 1 with zero reward.
    function RLBase.reset!(env::LeftRightEnv)
        env.current_state = 1
        env.reward = 0.0
    end

    RLBase.reward(env::LeftRightEnv) = env.reward
    RLBase.is_terminated(env::LeftRightEnv) = env.current_state == 2
    RLBase.state(env::LeftRightEnv) = env.current_state
end # LeftRightEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(2)`
## Action Space
`Base.OneTo(2)`
## Current State
```
1
```
xxxxxxxxxx1
1
world = LeftRightEnv()

# Cache the sizes of the state and action spaces for the approximators below.
begin
    ns = length(state_space(world))
    na = length(action_space(world))
end
# (A printed VBasedPolicy summary tree follows in the notebook output.)
├─ learner => MonteCarloLearner
│ ├─ approximator
│ │ ├─ 1
│ │ │ └─ TabularApproximator
│ │ │ ├─ table => 2-element Array{Float64,1}
│ │ │ └─ optimizer => Descent
│ │ │ └─ eta => 1.0
│ │ └─ 2
│ │ └─ TabularApproximator
│ │ ├─ table => 2-element Array{Float64,1}
│ │ └─ optimizer => InvDecay
│ │ ├─ gamma => 1.0
│ │ └─ state => IdDict
│ ├─ γ => 1.0
│ ├─ kind => ReinforcementLearningZoo.FirstVisit
│ └─ sampling => ReinforcementLearningZoo.OrdinaryImportanceSampling
└─ mapping => Main.var"#1#2"
xxxxxxxxxx12
1
# Target policy: first-visit Monte Carlo with ordinary importance sampling.
# The `mapping` always selects action 1 regardless of the value estimates.
π_t = VBasedPolicy(
    learner = MonteCarloLearner(
        approximator = (
            TabularVApproximator(; n_state = ns, opt = Descent(1.0)),  # V
            TabularVApproximator(; n_state = ns, opt = InvDecay(1.0)), # Returns
        ),
        kind = FIRST_VISIT,
        sampling = ORDINARY_IMPORTANCE_SAMPLING,
        γ = 1.0,
    ),
    mapping = (env, V) -> 1,
)
1
# A little ad-hoc here: the target policy is deterministic, so action 1 has
# probability 1.0 and every other action has probability 0.0.
RLBase.prob(::typeof(π_t), s, a) = a == 1 ? 1.0 : 0.0
1
# Hook that accumulates one value estimate per episode (see the functor below).
struct CollectValue <: AbstractHook
    values::Vector{Float64}
    # Start from a typed empty vector rather than an untyped `[]`.
    CollectValue() = new(Float64[])
end
1
# After each episode, record the second approximator's estimate for state 1
# (presumably the accumulated-returns table — verify against the learner setup).
(f::CollectValue)(::PostEpisodeStage, agent, env) = push!(f.values, agent.policy.π_target.learner.approximator[2](1))